# Computations
import numpy as np
import pandas as pd
import scipy.stats as stats
# sklearn
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score, KFold, StratifiedShuffleSplit
from sklearn.feature_selection import RFE
from sklearn import datasets
from sklearn import metrics
from xgboost import XGBClassifier
# Visualisation libraries
## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex, clear_output
## progressbar
import progressbar
## plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
## seaborn
import seaborn as sns
## matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
from matplotlib.font_manager import FontProperties
import matplotlib.colors as mcolors
# Global matplotlib defaults for every figure in this notebook.
plt.style.use('seaborn-whitegrid')  # NOTE(review): renamed 'seaborn-v0_8-whitegrid' in matplotlib >= 3.6 — confirm target version
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
# IPython magic: render figures inline in the notebook.
%matplotlib inline
import warnings
# Silence library warnings to keep cell output readable.
warnings.filterwarnings("ignore")
In this article, we analyze and predict customer churn for Telco Customer Churn data.
| Columns | Description |
|---|---|
| customerID | Customer ID |
| gender | Whether the customer is a male or a female |
| SeniorCitizen | Whether the customer is a senior citizen or not (1, 0) |
| Partner | Whether the customer has a partner or not (Yes, No) |
| Dependents | Whether the customer has dependents or not (Yes, No) |
| tenure | Number of months the customer has stayed with the company |
| PhoneService | Whether the customer has a phone service or not (Yes, No) |
| MultipleLines | Whether the customer has multiple lines or not (Yes, No, No phone service) |
| InternetService | Customer’s internet service provider (DSL, Fiber optic, No) |
| OnlineSecurity | Whether the customer has online security or not (Yes, No, No internet service) |
| OnlineBackup | Whether the customer has an online backup or not (Yes, No, No internet service) |
| DeviceProtection | Whether the customer has device protection or not (Yes, No, No internet service) |
| TechSupport | Whether the customer has tech support or not (Yes, No, No internet service) |
| StreamingTV | Whether the customer has streaming TV or not (Yes, No, No internet service) |
| StreamingMovies | Whether the customer has streaming movies or not (Yes, No, No internet service) |
| Contract | The contract term of the customer (Month-to-month, One year, Two years) |
| PaperlessBilling | Whether the customer has paperless billing or not (Yes, No) |
| PaymentMethod | The customer’s payment method (Electronic check, Mailed check, Bank transfer (automatic), Credit card (automatic)) |
| MonthlyCharges | The amount charged to the customer monthly |
| TotalCharges | The total amount charged to the customer |
| Churn | Whether the customer churned or not (Yes or No) |
# Location of the raw Telco churn CSV; the standardized version is read below
# from the same directory with an '_STD' suffix.
Path = 'telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv'
# Load the pre-standardized dataset.
# NOTE(review): assumes '<path-without-extension>_STD.csv' exists — confirm it
# was produced in an earlier step.
Data = pd.read_csv(Path.split(".")[0]+'_STD.csv')
# Feature frame: everything except the customer identifier.
df = Data.drop(columns = ['customer ID'])
# Integer class values -> human-readable churn labels.
Labels_dict = dict(zip([0, 1], ['No', 'Yes']))
Target = 'Churn'
# Preview the first rows. NOTE(review): Styler.hide_index/set_precision are
# deprecated in newer pandas (hide / format) — confirm pandas version.
display(Data.head(6).style.hide_index().set_precision(2))
| customer ID | Gender | Senior Citizen | Partner | Dependents | Tenure | Phone Service | Multiple Lines | Internet Service | Online Security | Online Backup | Device Protection | Tech Support | Streaming TV | Streaming Movies | Contract | Paperless Billing | Monthly Charges | Total Charges | Churn | Bank transfer (automatic) | Credit card (automatic) | Electronic check | Mailed check |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 7590-VHVEG | -1.01 | -0.44 | 1.03 | -0.65 | -1.28 | -3.05 | -0.85 | -0.29 | -0.10 | 1.18 | -0.17 | -0.10 | -0.22 | -0.23 | -0.83 | 0.83 | -1.16 | -0.99 | 0 | -0.53 | -0.53 | 1.41 | -0.54 |
| 5575-GNVDE | 0.99 | -0.44 | -0.97 | -0.65 | 0.07 | 0.33 | -0.85 | -0.29 | 1.32 | -0.17 | 1.18 | -0.10 | -0.22 | -0.23 | 0.37 | -1.21 | -0.26 | -0.17 | 0 | -0.53 | -0.53 | -0.71 | 1.84 |
| 3668-QPYBK | 0.99 | -0.44 | -0.97 | -0.65 | -1.24 | 0.33 | -0.85 | -0.29 | 1.32 | 1.18 | -0.17 | -0.10 | -0.22 | -0.23 | -0.83 | 0.83 | -0.36 | -0.96 | 1 | -0.53 | -0.53 | -0.71 | 1.84 |
| 7795-CFOCW | 0.99 | -0.44 | -0.97 | -0.65 | 0.51 | -3.05 | -0.85 | -0.29 | 1.32 | -0.17 | 1.18 | 1.31 | -0.22 | -0.23 | 0.37 | -1.21 | -0.75 | -0.20 | 0 | 1.89 | -0.53 | -0.71 | -0.54 |
| 9237-HQITU | -1.01 | -0.44 | -0.97 | -0.65 | -1.24 | 0.33 | -0.85 | 1.00 | -0.10 | -0.17 | -0.17 | -0.10 | -0.22 | -0.23 | -0.83 | 0.83 | 0.20 | -0.94 | 1 | -0.53 | -0.53 | 1.41 | -0.54 |
| 9305-CDSKC | -1.01 | -0.44 | -0.97 | -0.65 | -0.99 | 0.33 | 1.17 | 1.00 | -0.10 | -0.17 | 1.18 | -0.10 | 1.10 | 1.09 | -0.83 | 0.83 | 1.16 | -0.65 | 1 | -0.53 | -0.53 | 1.41 | -0.54 |
def DatasetTargetDist(Inp, Target, Labels_dict, PD):
    """Show the class distribution of ``Target`` in ``Inp``: a count/percentage
    table on the left next to a donut (pie) chart on the right.

    Parameters
    ----------
    Inp : pd.DataFrame
        Dataset containing the target column.
    Target : str
        Name of the target column.
    Labels_dict : dict
        Maps raw class values (e.g. 0/1) to display labels (e.g. 'No'/'Yes').
    PD : dict
        Plot settings: 'column_widths', 'pull', 'textfont', 'PieColors',
        'hole', 'height', 'legend_title', 'TableColors', 'tablecolumnwidth',
        'title_x', 'title_y'.
    """
    # Per-class counts and their percentage share of the dataset.
    Table = Inp[Target].value_counts().to_frame('Count').reset_index(drop = False).rename(columns = {'index':Target})
    Table[Target] = Table[Target].replace(Labels_dict)
    Table['Percentage'] = np.round(100*(Table['Count']/Table['Count'].sum()),2)
    fig = make_subplots(rows=1, cols=2, horizontal_spacing = 0.02, column_widths=PD['column_widths'],
                        specs=[[{"type": "table"},{"type": "pie"}]])
    # Right panel: donut chart of the class distribution.
    fig.add_trace(go.Pie(labels=Table[Target].values, values=Table['Count'].values,
                         pull=PD['pull'], textfont=dict(size= PD['textfont']),
                         marker=dict(colors = PD['PieColors'], line=dict(color='black', width=1))), row=1, col=2)
    fig.update_traces(hole=PD['hole'])
    fig.update_layout(height = PD['height'], legend=dict(orientation="v"), legend_title_text= PD['legend_title'])
    # Left panel: the same numbers as a styled plotly table.
    T = Table.copy()
    # Percent-prefixed formatting, e.g. '%26.54' (the doubled %% escapes '%').
    T['Percentage'] = T['Percentage'].map(lambda x: '%%%.2f' % x)
    # go.Table expects one array of cell values per column.
    Temp = []
    for i in T.columns:
        Temp.append(T.loc[:,i].values)
    fig.add_trace(go.Table(header=dict(values = list(Table.columns), line_color='darkslategray',
                                       fill_color= PD['TableColors'][0], align=['center','center'],
                                       font=dict(color='white', size=12), height=25), columnwidth = PD['tablecolumnwidth'],
                           cells=dict(values=Temp, line_color='darkslategray',
                                      fill=dict(color= [PD['TableColors'][1], PD['TableColors'][1]]),
                                      align=['center', 'center'], font_size=12, height=20)), 1, 1)
    fig.update_layout(title={'text': '<b>' + Target + '<b>', 'x':PD['title_x'],
                             'y':PD['title_y'], 'xanchor': 'center', 'yanchor': 'top'})
    fig.show()
# Pie 'pull' offsets: keep every slice in place except the last class, which is
# pulled out by 0.05 for emphasis.
Pull = [0 for x in range((len(Labels_dict)-1))]
Pull.append(.05)
# Shared plot settings consumed by DatasetTargetDist.
PD = dict(PieColors = ['SeaGreen', 'Tomato'], TableColors = ['Indigo','GhostWhite'], hole = .4,
          column_widths=[0.6, 0.4],textfont = 14, height = 350, tablecolumnwidth = [0.1, 0.1, 0.1],
          pull = Pull, legend_title = Target, title_x = 0.5, title_y = 0.8)
del Pull
DatasetTargetDist(Data, Target, Labels_dict, PD)
Here we use StratifiedShuffleSplit, a stratified variation of shuffle-split cross-validation: each generated train/test split contains approximately the same percentage of samples of each target class as the complete set.
# Features / target as NumPy arrays.
X = df.drop(columns = Target).values
y = df[Target].values
Test_Size = 0.3
# One stratified 70/30 split that preserves the churn class ratio.
sss = StratifiedShuffleSplit(n_splits=1, test_size=Test_Size, random_state=42)
for train_index, test_index in sss.split(X, y):
    # X: split indices are positional, so DataFrames need .iloc
    # (fix: .loc breaks on a non-default index).
    if isinstance(X, pd.DataFrame):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    else:
        X_train, X_test = X[train_index], X[test_index]
    # y: likewise use positional indexing for Series.
    if isinstance(y, pd.Series):
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    else:
        y_train, y_test = y[train_index], y[test_index]
del sss
def Train_Test_Dist(X_train, y_train, X_test, y_test, PD, Labels_dict = Labels_dict):
    """Show the train/test array shapes (table, left) and the class balance of
    y_train and y_test as two donut charts (middle, right).

    PD must supply 'column_widths', 'pull', 'textfont', 'PieColors',
    'legend_title', 'TableColors', 'tablecolumnwidth', 'title_x', 'title_y'
    and 'height' (None keeps plotly's default figure height).
    """
    def ToSeries(x):
        # Normalize ndarray input to a Series so .replace/.value_counts work.
        if not isinstance(x, pd.Series):
            Out = pd.Series(x)
        else:
            Out = x.copy()
        return Out
    fig = make_subplots(rows=1, cols=3, horizontal_spacing = 0.02, column_widths= PD['column_widths'],
                        specs=[[{"type": "table"},{'type':'domain'}, {'type':'domain'}]])
    # Middle/right panels: one pie per label set (train in col 2, test in col 3).
    # Renamed the loop variable (was 'y') so it no longer shadows the module-level y.
    C = 2
    for y_set in [ToSeries(y_train).replace(Labels_dict), ToSeries(y_test).replace(Labels_dict)]:
        # NOTE(review): labels come from Labels_dict while values come from
        # value_counts() (most-frequent-first ordering) — verify they align.
        fig.add_trace(go.Pie(labels= list(Labels_dict.values()),
                             values= y_set.value_counts().values, pull=PD['pull'],
                             textfont=dict(size=PD['textfont']),
                             marker=dict(colors = PD['PieColors'],
                                         line=dict(color='black', width=1))), row=1, col=C)
        C += 1
    # Donut hole and legend settings apply to all pie traces at once
    # (hoisted out of the loop; the settings are idempotent).
    fig.update_traces(hole=.5)
    fig.update_layout(legend=dict(orientation="v"), legend_title_text= PD['legend_title'])
    # Left panel: table of the four array shapes.
    Table = pd.DataFrame(data={'Set':['X_train','X_test','y_train','y_test'],
                               'Shape':[X_train.shape, X_test.shape, y_train.shape, y_test.shape]}).astype(str)
    Temp = [Table.loc[:, col].values for col in Table.columns]
    TableColors = PD['TableColors']
    fig.add_trace(go.Table(header=dict(values = list(Table.columns), line_color='darkslategray',
                                       fill_color= TableColors[0], align=['center','center'],
                                       font=dict(color='white', size=12), height=25), columnwidth = PD['tablecolumnwidth'],
                           cells=dict(values=Temp, line_color='darkslategray',
                                      fill=dict(color= [TableColors[1], TableColors[1]]),
                                      align=['center', 'center'], font_size=12, height=20)), 1, 1)
    fig.update_layout(title={'text': '<b>' + 'Dataset Distribution' + '<b>', 'x':PD['title_x'],
                             'y':PD['title_y'], 'xanchor': 'center', 'yanchor': 'top'})
    # Only override the figure height when one was provided
    # (fix: identity comparison with None instead of '== None').
    if PD['height'] is not None:
        fig.update_layout(height = PD['height'])
    fig.show()
# Reuse PD with layout tweaks for the three-panel train/test figure.
PD.update(dict(column_widths=[0.3, 0.3, 0.3], tablecolumnwidth = [0.2, 0.4], height = 350, legend_title = Target))
Train_Test_Dist(X_train, y_train, X_test, y_test, PD)
XGBoost implements the Gradient Boosting [4] framework and provides a parallel tree boosting (also known as GBDT, GBM) that solves many data science problems in a fast and accurate way [5].
def Header(Text, L = 100, C = 'Blue', T = 'White'):
    """Print ``Text`` highlighted on a colored background (background ``C``,
    foreground ``T``), followed by a colored '=' rule padding the line to ``L``
    characters."""
    BACK = {'Black': Back.BLACK, 'Red': Back.RED, 'Green': Back.GREEN, 'Yellow': Back.YELLOW, 'Blue': Back.BLUE,
            'Magenta': Back.MAGENTA, 'Cyan': Back.CYAN}
    FORE = {'Black': Fore.BLACK, 'Red': Fore.RED, 'Green': Fore.GREEN, 'Yellow': Fore.YELLOW, 'Blue': Fore.BLUE,
            'Magenta': Fore.MAGENTA, 'Cyan': Fore.CYAN, 'White': Fore.WHITE}
    banner = BACK[C] + FORE[T] + Style.NORMAL + Text + Style.RESET_ALL
    rule = FORE[C] + Style.NORMAL + (L - len(Text) - 1) * '=' + Style.RESET_ALL
    print(banner + ' ' + rule)
def Line(L=100, C = 'Blue'):
    """Print a horizontal rule of ``L`` '=' characters in foreground color ``C``."""
    FORE = {'Black': Fore.BLACK, 'Red': Fore.RED, 'Green': Fore.GREEN, 'Yellow': Fore.YELLOW, 'Blue': Fore.BLUE,
            'Magenta': Fore.MAGENTA, 'Cyan': Fore.CYAN, 'White': Fore.WHITE}
    rule = '=' * L
    print(FORE[C] + Style.NORMAL + rule + Style.RESET_ALL)
def Search_List(Key, List): return [s for s in List if Key in s]
def Best_Parm(model, param_dist, Top = None, X = X, y = y, n_splits = 20, scoring = 'precision', H = 600, titleY = .95):
    """Run RandomizedSearchCV over ``param_dist`` with stratified shuffle-split
    CV, display the top ``Top`` parameter sets as a styled table, plot the
    train/test scores per candidate, and return the fitted search object.

    Top=None shows every evaluated candidate. X/y default to the module-level
    full dataset (bound at definition time).
    """
    grid = RandomizedSearchCV(estimator = model, param_distributions = param_dist,
                              cv = StratifiedShuffleSplit(n_splits=n_splits, test_size=Test_Size, random_state=42),
                              n_iter = int(1e3), scoring = scoring, error_score = 0, verbose = 0,
                              n_jobs = 10, return_train_score = True)
    _ = grid.fit(X, y)
    Table = Grid_Table(grid)
    # Fix: identity comparison with None instead of '== None'.
    if Top is None:
        Top = Table.shape[0]
    Table = Table.iloc[:Top,:]
    # Human-readable 'mean ± std' columns for display only; keep the raw
    # numeric columns in Table for the plot below.
    T = Table.copy()
    T['Train Score'] = T['Mean Train Score'].map(lambda x: ('%.2e' % x))+ ' ± ' +T['STD Train Score'].map(lambda x: ('%.2e' % x))
    T['Test Score'] = T['Mean Test Score'].map(lambda x: ('%.2e' % x))+ ' ± ' +T['STD Test Score'].map(lambda x: ('%.2e' % x))
    T['Fit Time'] = T['Mean Fit Time'].map(lambda x: ('%.2e' % x))+ ' ± ' +T['STD Fit Time'].map(lambda x: ('%.2e' % x))
    T = T.drop(columns = ['Mean Train Score','STD Train Score','Mean Test Score','STD Test Score','Mean Fit Time','STD Fit Time'])
    display(T.head(Top).style.hide_index().background_gradient(subset= ['Rank Test Score'],
            cmap=sns.diverging_palette(145, 300, s=60, as_cmap=True)).\
            set_properties(subset=['Params'], **{'background-color': 'Indigo', 'color': 'White'}).\
            set_properties(subset=['Train Score'], **{'background-color': 'HoneyDew', 'color': 'Black'}).\
            set_properties(subset=['Test Score'], **{'background-color': 'Azure', 'color': 'Black'}).\
            set_properties(subset=['Fit Time'], **{'background-color': 'Linen', 'color': 'Black'}))
    # Error-bar plot of mean train/test scores per candidate.
    Grid_Performance_Plot(Table, n_splits = n_splits, H = H, titleY = titleY)
    return grid
def Grid_Table(grid):
    """Summarize a fitted RandomizedSearchCV/GridSearchCV as a DataFrame sorted
    by test-score rank, with human-readable parameter strings."""
    cv = grid.cv_results_
    def _fmt(params):
        # "{'a': 1}" -> "a: 1" for compact display.
        return str(params).replace('{', '').replace('}', '').replace("'", '')
    columns = {'Rank Test Score': cv['rank_test_score'],
               'Params': [_fmt(p) for p in cv['params']],
               # Train
               'Mean Train Score': cv['mean_train_score'],
               'STD Train Score': cv['std_train_score'],
               # Test
               'Mean Test Score': cv['mean_test_score'],
               'STD Test Score': cv['std_test_score'],
               # Fit time
               'Mean Fit Time': cv['mean_fit_time'],
               'STD Fit Time': cv['std_fit_time']}
    return pd.DataFrame(columns).sort_values('Rank Test Score').reset_index(drop = True)
def Grid_Performance_Plot(Table, n_splits, H = 550, titleY =.95):
    """Plot mean ± std train scores (left) and test scores (right) for each
    parameter candidate in ``Table`` (the output of Grid_Table), with a shared
    y-axis range derived from the score spread."""
    # Lower y-axis bound from the smallest (mean - std) over both score sets.
    Temp = Table['Mean Train Score']-Table['STD Train Score']
    Temp = np.append(Temp, Table['Mean Test Score']-Table['STD Test Score'])
    # NOTE(review): (Temp*100 - Temp) equals 99*Temp — was floor(Temp*100)/100
    # (round down to 2 decimals) intended? Confirm before changing.
    L = np.floor((Temp*100- Temp)).min()/100
    # Upper y-axis bound from the largest (mean + std), mirrored with ceil
    # (here (Temp*100 + Temp) is 101*Temp — see the NOTE above).
    Temp = Table['Mean Train Score']+Table['STD Train Score']
    Temp = np.append(Temp, Table['Mean Test Score']+Table['STD Test Score'])
    R = np.ceil((Temp*100 + Temp)).max()/100
    fig = make_subplots(rows=1, cols=2, horizontal_spacing = 0.02, shared_yaxes=True,
                        subplot_titles=('<b>' + 'Train Set' + '<b>', '<b>' + 'Test Set' + '<b>'))
    # Scatter with error bars: train (left, green) and test (right, blue).
    fig.add_trace(go.Scatter(x= Table['Params'], y= Table['Mean Train Score'], showlegend=False, marker_color= 'SeaGreen',
                             error_y=dict(type='data',array=Table['STD Train Score'], visible=True)), 1, 1)
    fig.add_trace(go.Scatter(x= Table['Params'], y= Table['Mean Test Score'], showlegend=False, marker_color= 'RoyalBlue',
                             error_y=dict(type='data',array= Table['STD Test Score'], visible=True)), 1, 2)
    fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
                     zeroline=False, zerolinewidth=1, zerolinecolor='Black',
                     showgrid=False, gridwidth=1, gridcolor='Lightgray')
    fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
                     zeroline=True, zerolinewidth=1, zerolinecolor='Black',
                     showgrid=True, gridwidth=1, gridcolor='Lightgray', range= [L, R])
    fig.update_yaxes(title_text="Mean Score", row=1, col=1)
    fig.update_layout(plot_bgcolor= 'white', width = 980, height = H,
                      title={'text': '<b>' + 'RandomizedSearchCV with %i-fold cross validation' % n_splits + '<b>',
                             'x':0.5, 'y':titleY, 'xanchor': 'center', 'yanchor': 'top'})
    fig.show()
def _Aggregate_CV_Results(Reports, CMs, Template):
    """Average per-split classification reports and confusion matrices.

    Reports : list of report value arrays (one per CV split).
    CMs : matching list of confusion matrices.
    Template : a report DataFrame supplying the index/columns for the output.
    Returns ('mean ± std' string DataFrame, integer-rounded mean confusion matrix).
    """
    # vstack of raveled arrays always yields a 2-D stack, so this also works
    # for a single split (the previous inline code broke at n_splits=1).
    ALL = np.vstack([Rep.ravel() for Rep in Reports])
    CM = np.vstack([Mat.ravel() for Mat in CMs])
    Mean = pd.DataFrame(ALL.mean(axis = 0).reshape(Template.shape), index = Template.index, columns = Template.columns)
    STD = pd.DataFrame(ALL.std(axis = 0).reshape(Template.shape), index = Template.index, columns = Template.columns)
    Report = Mean.applymap(lambda x: ('%.4f' % x)) + ' ± ' + STD.applymap(lambda x: ('%.4f' % x))
    CM_Mean = CM.mean(axis = 0).reshape(CMs[0].shape).round(0).astype(int)
    return Report, CM_Mean
def Stratified_CV_Scoring(model, X = X, y = y, n_splits = 10, Labels = list(Labels_dict.values())):
    """Refit ``model`` on each of ``n_splits`` stratified shuffle splits and
    aggregate per-split classification reports and confusion matrices.

    Returns (train report, test report, train CM, test CM) where reports are
    DataFrames of 'mean ± std' strings and CMs are mean integer matrices.
    X/y default to the module-level full dataset (bound at definition time).
    """
    sss = StratifiedShuffleSplit(n_splits = n_splits, test_size=Test_Size, random_state=42)
    # Normalize to ndarrays so positional indexing below is uniform.
    if isinstance(X, pd.DataFrame):
        X = X.values
    if isinstance(y, pd.Series):
        y = y.values
    Reports_Train, Reports_Test = [], []
    CM_Train, CM_Test = [], []
    R = None
    for train_index, test_index in sss.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        _ = model.fit(X_train, y_train)
        # Train-set report / confusion matrix for this split.
        y_pred = model.predict(X_train)
        R = pd.DataFrame(metrics.classification_report(y_train, y_pred, target_names=Labels, output_dict=True)).T
        Reports_Train.append(R.values)
        CM_Train.append(metrics.confusion_matrix(y_train, y_pred))
        # Test-set report / confusion matrix for this split.
        y_pred = model.predict(X_test)
        R = pd.DataFrame(metrics.classification_report(y_test, y_pred, target_names=Labels, output_dict=True)).T
        Reports_Test.append(R.values)
        CM_Test.append(metrics.confusion_matrix(y_test, y_pred))
    # Aggregate across splits (previously duplicated inline for train and test).
    Reports_Train, CM_Train = _Aggregate_CV_Results(Reports_Train, CM_Train, R)
    Reports_Test, CM_Test = _Aggregate_CV_Results(Reports_Test, CM_Test, R)
    Reports_Train = Reports_Train.reset_index().rename(columns ={'index': 'Train Set (CV = % i)' % n_splits})
    Reports_Test = Reports_Test.reset_index().rename(columns ={'index': 'Test Set (CV = % i)' % n_splits})
    return Reports_Train, Reports_Test, CM_Train, CM_Test
def Confusion_Mat(CM_Train, CM_Test, PD, n_splits = 10):
    """Draw raw and row-normalized confusion-matrix heatmaps for the train and
    test sets (one figure per set, two panels each).

    PD supplies 'FS' (figsize), 'annot_kws' (annotation font size), 'shrink'
    (colorbar shrink) and 'Labels' (tick labels). n_splits=None drops the
    '(CV = k)' suffix from the figure titles.
    """
    # Fix: identity comparison with None instead of '== None'.
    if n_splits is None:
        Titles = ['Train Set', 'Test Set']
    else:
        Titles = ['Train Set (CV = % i)' % n_splits, 'Test Set (CV = % i)' % n_splits]
    CM = [CM_Train, CM_Test]
    # Green palettes for train, blue for test (raw matrix, then normalized).
    Cmap = ['Greens', 'YlGn','Blues', 'PuBu']
    for i in range(2):
        fig, ax = plt.subplots(1, 2, figsize= PD['FS'])
        fig.suptitle(Titles[i], weight = 'bold', fontsize = 16)
        _ = sns.heatmap(CM[i], annot=True, annot_kws={"size": PD['annot_kws']}, cmap=Cmap[2*i], ax = ax[0],
                        linewidths = 0.2, cbar_kws={"shrink": PD['shrink']})
        _ = ax[0].set_title('Confusion Matrix')
        # Row-normalize so each true class sums to 1.
        Temp = np.round(CM[i].astype('float') / CM[i].sum(axis=1)[:, np.newaxis], 2)
        _ = sns.heatmap(Temp,
                        annot=True, annot_kws={"size": PD['annot_kws']}, cmap=Cmap[2*i+1], ax = ax[1],
                        linewidths = 0.4, vmin=0, vmax=1, cbar_kws={"shrink": PD['shrink']})
        _ = ax[1].set_title('Normalized Confusion Matrix')
        for a in ax:
            _ = a.set_xlabel('Predicted labels')
            _ = a.set_ylabel('True labels')
            _ = a.xaxis.set_ticklabels(PD['Labels'])
            _ = a.yaxis.set_ticklabels(PD['Labels'])
            _ = a.set_aspect(1)
def Train_Test_Scores(CM_Train, CM_Test):
    """Print precision, recall, TPR, TNR and balanced accuracy derived from the
    2x2 train/test confusion matrices (sklearn layout: tn, fp, fn, tp)."""
    for name, cm, color in zip(('Train', 'Test'), (CM_Train, CM_Test), ('Green', 'Blue')):
        Header('%s Set' % name, C = color)
        tn, fp, fn, tp = cm.ravel()
        precision = tp/(tp+fp)
        recall = tp/(tp + fn)
        tpr = tp/(tp +fn)
        tnr = tn/(tn +fp)
        balanced_acc = (tpr + tnr)/2
        print('Precision (%s) = %.2f' % (name, precision))
        print('Recall (%s) = %.2f' % (name, recall))
        print('TPR (%s) = %.2f' % (name, tpr))
        print('TNR (%s) = %.2f' % (name, tnr))
        print('Balanced Accuracy (%s) = %.2f' % (name, balanced_acc))
    Line()
Some of the metrics that we use here to measure the accuracy: \begin{align} \text{Confusion Matrix} = \begin{bmatrix}T_n & F_p\\ F_n & T_p\end{bmatrix}. \end{align}
where $T_p$, $T_n$, $F_p$, and $F_n$ represent true positive, true negative, false positive, and false negative, respectively.
\begin{align} \text{Precision} &= \frac{T_{p}}{T_{p} + F_{p}},\\ \text{Recall} &= \frac{T_{p}}{T_{p} + F_{n}},\\ \text{F1} &= \frac{2 \times \text{Precision} \times \text{Recall}}{\text{Precision} + \text{Recall}}\\ \text{Balanced-Accuracy (bACC)} &= \frac{1}{2}\left( \frac{T_{p}}{T_{p} + F_{n}} + \frac{T_{n}}{T_{n} + F_{p}}\right ) \end{align}The accuracy can be a misleading metric for imbalanced data sets. In these cases, a balanced accuracy (bACC) [4] is recommended that normalizes true positive and true negative predictions by the number of positive and negative samples, respectively, and divides their sum by two.
# --- XGBoost baseline with default hyper-parameters ---
Name = 'XGBoost'
Header('%s with Default Parameters' % Name)
n_splits = 20
# GPU histogram tree method; quiet output.
XGB = XGBClassifier(tree_method = 'gpu_hist', verbosity = 0)
print('Default Parameters = %s' % XGB.get_params(deep=True))
# NOTE(review): this fit is immediately superseded by the refits inside
# Stratified_CV_Scoring — appears redundant; confirm before removing.
_ = XGB.fit(X_train, y_train)
# Cross-validated classification reports and averaged confusion matrices.
Reports_Train, Reports_Test, CM_Train, CM_Test = Stratified_CV_Scoring(XGB, X = X, y = y, n_splits = n_splits)
display(Reports_Train.style.hide_index().set_properties(**{'background-color': 'HoneyDew', 'color': 'Black'}).\
        set_properties(subset=['Train Set (CV = % i)' % n_splits], **{'background-color': 'SeaGreen', 'color': 'White'}))
display(Reports_Test.style.hide_index().set_properties(**{'background-color': 'Azure', 'color': 'Black'}).\
        set_properties(subset=['Test Set (CV = % i)' % n_splits], **{'background-color': 'RoyalBlue', 'color': 'White'}))
# Heatmap styling for Confusion_Mat.
PD = dict(FS = (10, 5), annot_kws = 14, shrink = .6, Labels = list(Labels_dict.values()))
Confusion_Mat(CM_Train, CM_Test, PD = PD, n_splits = n_splits)
Train_Test_Scores(CM_Train, CM_Test)
XGBoost with Default Parameters ==================================================================== Default Parameters = {'objective': 'binary:logistic', 'use_label_encoder': True, 'base_score': None, 'booster': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': None, 'gamma': None, 'gpu_id': None, 'importance_type': 'gain', 'interaction_constraints': None, 'learning_rate': None, 'max_delta_step': None, 'max_depth': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'n_estimators': 100, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': None, 'reg_alpha': None, 'reg_lambda': None, 'scale_pos_weight': None, 'subsample': None, 'tree_method': 'gpu_hist', 'validate_parameters': None, 'verbosity': 0}
| Train Set (CV = 20) | precision | recall | f1-score | support |
|---|---|---|---|---|
| No | 0.9576 ± 0.0050 | 0.9710 ± 0.0033 | 0.9642 ± 0.0033 | 3622.0000 ± 0.0000 |
| Yes | 0.9164 ± 0.0091 | 0.8810 ± 0.0146 | 0.8983 ± 0.0098 | 1308.0000 ± 0.0000 |
| accuracy | 0.9471 ± 0.0049 | 0.9471 ± 0.0049 | 0.9471 ± 0.0049 | 0.9471 ± 0.0049 |
| macro avg | 0.9370 ± 0.0058 | 0.9260 ± 0.0077 | 0.9313 ± 0.0065 | 4930.0000 ± 0.0000 |
| weighted avg | 0.9467 ± 0.0050 | 0.9471 ± 0.0049 | 0.9467 ± 0.0050 | 4930.0000 ± 0.0000 |
| Test Set (CV = 20) | precision | recall | f1-score | support |
|---|---|---|---|---|
| No | 0.8351 ± 0.0045 | 0.8776 ± 0.0082 | 0.8558 ± 0.0041 | 1552.0000 ± 0.0000 |
| Yes | 0.6061 ± 0.0146 | 0.5204 ± 0.0170 | 0.5598 ± 0.0120 | 561.0000 ± 0.0000 |
| accuracy | 0.7827 ± 0.0057 | 0.7827 ± 0.0057 | 0.7827 ± 0.0057 | 0.7827 ± 0.0057 |
| macro avg | 0.7206 ± 0.0083 | 0.6990 ± 0.0077 | 0.7078 ± 0.0075 | 2113.0000 ± 0.0000 |
| weighted avg | 0.7743 ± 0.0058 | 0.7827 ± 0.0057 | 0.7772 ± 0.0056 | 2113.0000 ± 0.0000 |
Train Set ========================================================================================== Precision (Train) = 0.92 Recall (Train) = 0.88 TPR (Train) = 0.88 TNR (Train) = 0.97 Balanced Accuracy (Train) = 0.93 Test Set =========================================================================================== Precision (Test) = 0.61 Recall (Test) = 0.52 TPR (Test) = 0.52 TNR (Test) = 0.88 Balanced Accuracy (Test) = 0.70 ====================================================================================================
In order to find the best parameters for our model, we can use RandomizedSearchCV. Here, we have defined a function Best_Parm to find the best parameters.
# --- Randomized hyper-parameter search for XGBoost ---
XGB = XGBClassifier(tree_method = 'gpu_hist', updater = 'grow_gpu_hist', verbosity = 0)
# Candidate values for learning rate, split loss, tree depth, L1 penalty
# and tree growth policy.
param_dist = dict(eta = [.2, .3, .4], gamma = [0, .5, 1], max_depth = [4, 6, 8],
                  alpha = [0, .2, .4], grow_policy = ['depthwise', 'lossguide'])
Header('%s with the Best Parameters' % Name)
# Run the search (precision scoring inside Best_Parm) and show the top 20 candidates.
grid = Best_Parm(model = XGB, param_dist = param_dist, Top = 20, H = 950, titleY =.96)
XGBoost with the Best Parameters ===================================================================
| Rank Test Score | Params | Train Score | Test Score | Fit Time |
|---|---|---|---|---|
| 1 | max_depth: 4, grow_policy: lossguide, gamma: 1, eta: 0.2, alpha: 0 | 7.09e-01 ± 8.42e-03 | 6.57e-01 ± 1.79e-02 | 1.78e+00 ± 1.11e-01 |
| 1 | max_depth: 4, grow_policy: depthwise, gamma: 1, eta: 0.2, alpha: 0 | 7.09e-01 ± 8.42e-03 | 6.57e-01 ± 1.79e-02 | 1.72e+00 ± 1.10e-01 |
| 3 | max_depth: 4, grow_policy: lossguide, gamma: 1, eta: 0.2, alpha: 0.2 | 7.08e-01 ± 6.22e-03 | 6.56e-01 ± 1.76e-02 | 1.66e+00 ± 1.47e-01 |
| 3 | max_depth: 4, grow_policy: depthwise, gamma: 1, eta: 0.2, alpha: 0.2 | 7.08e-01 ± 6.22e-03 | 6.56e-01 ± 1.76e-02 | 1.61e+00 ± 9.14e-02 |
| 5 | max_depth: 4, grow_policy: depthwise, gamma: 1, eta: 0.2, alpha: 0.4 | 7.08e-01 ± 4.73e-03 | 6.55e-01 ± 1.84e-02 | 1.60e+00 ± 6.34e-02 |
| 5 | max_depth: 4, grow_policy: lossguide, gamma: 1, eta: 0.2, alpha: 0.4 | 7.08e-01 ± 4.73e-03 | 6.55e-01 ± 1.84e-02 | 1.70e+00 ± 1.19e-01 |
| 7 | max_depth: 4, grow_policy: depthwise, gamma: 1, eta: 0.3, alpha: 0.4 | 7.08e-01 ± 7.25e-03 | 6.55e-01 ± 1.89e-02 | 1.47e+00 ± 8.22e-02 |
| 7 | max_depth: 4, grow_policy: lossguide, gamma: 1, eta: 0.3, alpha: 0.4 | 7.08e-01 ± 7.25e-03 | 6.55e-01 ± 1.89e-02 | 1.51e+00 ± 1.10e-01 |
| 9 | max_depth: 4, grow_policy: lossguide, gamma: 0.5, eta: 0.3, alpha: 0.4 | 7.24e-01 ± 8.22e-03 | 6.54e-01 ± 1.56e-02 | 1.68e+00 ± 9.78e-02 |
| 9 | max_depth: 4, grow_policy: depthwise, gamma: 0.5, eta: 0.3, alpha: 0.4 | 7.24e-01 ± 8.22e-03 | 6.54e-01 ± 1.56e-02 | 1.66e+00 ± 1.22e-01 |
| 11 | max_depth: 4, grow_policy: lossguide, gamma: 1, eta: 0.3, alpha: 0 | 7.10e-01 ± 7.35e-03 | 6.52e-01 ± 1.80e-02 | 1.51e+00 ± 8.22e-02 |
| 11 | max_depth: 4, grow_policy: depthwise, gamma: 1, eta: 0.3, alpha: 0 | 7.10e-01 ± 7.35e-03 | 6.52e-01 ± 1.80e-02 | 1.47e+00 ± 1.02e-01 |
| 13 | max_depth: 4, grow_policy: lossguide, gamma: 0.5, eta: 0.2, alpha: 0 | 7.23e-01 ± 9.72e-03 | 6.52e-01 ± 1.58e-02 | 1.88e+00 ± 1.35e-01 |
| 13 | max_depth: 4, grow_policy: depthwise, gamma: 0.5, eta: 0.2, alpha: 0 | 7.23e-01 ± 9.72e-03 | 6.52e-01 ± 1.58e-02 | 1.85e+00 ± 7.60e-02 |
| 15 | max_depth: 4, grow_policy: lossguide, gamma: 0.5, eta: 0.2, alpha: 0.4 | 7.19e-01 ± 9.97e-03 | 6.52e-01 ± 1.76e-02 | 1.86e+00 ± 9.48e-02 |
| 15 | max_depth: 4, grow_policy: depthwise, gamma: 0.5, eta: 0.2, alpha: 0.4 | 7.19e-01 ± 9.97e-03 | 6.52e-01 ± 1.76e-02 | 1.81e+00 ± 8.68e-02 |
| 17 | max_depth: 4, grow_policy: depthwise, gamma: 1, eta: 0.4, alpha: 0.4 | 7.08e-01 ± 7.34e-03 | 6.52e-01 ± 1.90e-02 | 1.42e+00 ± 7.96e-02 |
| 17 | max_depth: 4, grow_policy: lossguide, gamma: 1, eta: 0.4, alpha: 0.4 | 7.08e-01 ± 7.34e-03 | 6.52e-01 ± 1.90e-02 | 1.44e+00 ± 1.13e-01 |
| 19 | max_depth: 4, grow_policy: lossguide, gamma: 1, eta: 0.3, alpha: 0.2 | 7.08e-01 ± 6.60e-03 | 6.52e-01 ± 1.88e-02 | 1.80e+00 ± 7.26e-02 |
| 19 | max_depth: 4, grow_policy: depthwise, gamma: 1, eta: 0.3, alpha: 0.2 | 7.08e-01 ± 6.60e-03 | 6.52e-01 ± 1.88e-02 | 1.68e+00 ± 9.93e-02 |
Since we have identified the best parameters for our modeling, we train another model using these parameters.
# --- Retrain and evaluate with the best parameters from the randomized search ---
Header('%s with the Best Parameters' % Name)
XGB = XGBClassifier(**grid.best_params_)
print('Default Parameters = %s' % XGB.get_params(deep=True))
# NOTE(review): this fit is immediately superseded by the refits inside
# Stratified_CV_Scoring — appears redundant; confirm before removing.
_ = XGB.fit(X_train, y_train)
Reports_Train, Reports_Test, CM_Train, CM_Test = Stratified_CV_Scoring(XGB, X = X, y = y, n_splits = n_splits)
display(Reports_Train.style.hide_index().set_properties(**{'background-color': 'HoneyDew', 'color': 'Black'}).\
        set_properties(subset=['Train Set (CV = % i)' % n_splits], **{'background-color': 'DarkGreen', 'color': 'White'}))
display(Reports_Test.style.hide_index().set_properties(**{'background-color': 'Azure', 'color': 'Black'}).\
        set_properties(subset=['Test Set (CV = % i)' % n_splits], **{'background-color': 'MediumBlue', 'color': 'White'}))
Confusion_Mat(CM_Train, CM_Test, PD = PD, n_splits = n_splits)
Train_Test_Scores(CM_Train, CM_Test)
XGBoost with the Best Parameters =================================================================== Default Parameters = {'objective': 'binary:logistic', 'use_label_encoder': True, 'base_score': None, 'booster': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': None, 'gamma': 1, 'gpu_id': None, 'importance_type': 'gain', 'interaction_constraints': None, 'learning_rate': None, 'max_delta_step': None, 'max_depth': 4, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'n_estimators': 100, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': None, 'reg_alpha': None, 'reg_lambda': None, 'scale_pos_weight': None, 'subsample': None, 'tree_method': None, 'validate_parameters': None, 'verbosity': None, 'grow_policy': 'depthwise', 'eta': 0.2, 'alpha': 0}
| Train Set (CV = 20) | precision | recall | f1-score | support |
|---|---|---|---|---|
| No | 0.8780 ± 0.0076 | 0.9295 ± 0.0052 | 0.9030 ± 0.0056 | 3622.0000 ± 0.0000 |
| Yes | 0.7668 ± 0.0174 | 0.6422 ± 0.0244 | 0.6989 ± 0.0203 | 1308.0000 ± 0.0000 |
| accuracy | 0.8533 ± 0.0088 | 0.8533 ± 0.0088 | 0.8533 ± 0.0088 | 0.8533 ± 0.0088 |
| macro avg | 0.8224 ± 0.0119 | 0.7858 ± 0.0135 | 0.8009 ± 0.0129 | 4930.0000 ± 0.0000 |
| weighted avg | 0.8485 ± 0.0096 | 0.8533 ± 0.0088 | 0.8488 ± 0.0094 | 4930.0000 ± 0.0000 |
| Test Set (CV = 20) | precision | recall | f1-score | support |
|---|---|---|---|---|
| No | 0.8404 ± 0.0051 | 0.8930 ± 0.0075 | 0.8659 ± 0.0039 | 1552.0000 ± 0.0000 |
| Yes | 0.6423 ± 0.0143 | 0.5306 ± 0.0192 | 0.5809 ± 0.0133 | 561.0000 ± 0.0000 |
| accuracy | 0.7968 ± 0.0056 | 0.7968 ± 0.0056 | 0.7968 ± 0.0056 | 0.7968 ± 0.0056 |
| macro avg | 0.7413 ± 0.0082 | 0.7118 ± 0.0086 | 0.7234 ± 0.0081 | 2113.0000 ± 0.0000 |
| weighted avg | 0.7878 ± 0.0060 | 0.7968 ± 0.0056 | 0.7902 ± 0.0058 | 2113.0000 ± 0.0000 |
Train Set ========================================================================================== Precision (Train) = 0.77 Recall (Train) = 0.64 TPR (Train) = 0.64 TNR (Train) = 0.93 Balanced Accuracy (Train) = 0.79 Test Set =========================================================================================== Precision (Test) = 0.64 Recall (Test) = 0.53 TPR (Test) = 0.53 TNR (Test) = 0.89 Balanced Accuracy (Test) = 0.71 ====================================================================================================